In [1]:
import os
import sys

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.8.2.1-src.zip'))

print sys.path

execfile(os.path.join(spark_home, 'python/pyspark/shell.py'))


['/big/opt/spark-1.3.1/python/lib/py4j-0.8.2.1-src.zip', '/big/opt/spark-1.3.1/python', '', '/big/home/kent/git/pixnet_hackathon_2015', '/usr/lib/python2.7', '/usr/lib/python2.7/plat-linux2', '/usr/lib/python2.7/lib-tk', '/usr/lib/python2.7/lib-old', '/usr/lib/python2.7/lib-dynload', '/usr/local/lib/python2.7/dist-packages', '/usr/lib/python2.7/dist-packages', '/usr/lib/python2.7/dist-packages/PIL', '/usr/lib/python2.7/dist-packages/gtk-2.0', '/usr/lib/pymodules/python2.7', '/usr/local/lib/python2.7/dist-packages/IPython/extensions']
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 1.3.1
      /_/

Using Python version 2.7.3 (default, Mar 13 2014 11:03:55)
SparkContext available as sc, HiveContext available as sqlContext.

In [2]:
from pyspark.mllib.feature import Word2Vec
inp = sc.textFile('./data/new_parsed_no_spam.txt').map(lambda row: row.split(" "))
word2vec = Word2Vec()
model = word2vec.fit(inp)

In [33]:
pickle.dump(user_tags,open("./data/user_tags.pkl",'w'))

tag mapping


In [72]:
import numpy as np
import pickle
import jieba ,util
import csv ,json
jieba.load_userdict("./new.dict_all")
stop_words = util.load_stop_words('stopword.txt')
from pyspark.mllib.feature import Vectors
stop_words = util.load_stop_words('stopword.txt')


for line in open("./data/cookies_tags.csv.1"):#buf :
    cookie ,tags = line.split(',')
    b = Vectors.dense(np.zeros(100))
    count = 0
    for tag in tags.split(':') :
        if len(tag) > 4 :
            tags = jieba.cut(tag,cut_all = False)
            for item in tags :
                if item in stop_words: continue
                try:
                    b = b + model.transform(item)
                    count = count + 1
                except ValueError :
                    pass
        else :
            try:
                b = b + model.transform(tag)
                count = count + 1

            except ValueError :
                pass
    if count == 0 :
        user_tags[cookie] = b
    else :
        user_tags[cookie] = b/count

In [78]:
import csv
with open('./data/cookies_tags.csv.1.vec', 'w') as csvfile:
    writer = csv.writer(csvfile, delimiter=' ',
                            quotechar='|', quoting=csv.QUOTE_MINIMAL)
    for cookie in user_tags :
        vec = user_tags[cookie]
        writer.writerow(vec)

In [65]:
from pyspark.mllib.clustering import KMeans, KMeansModel
from numpy import array
from math import sqrt

# Load and parse the data
data = sc.textFile("./data/cookies_tags.csv.1.vec")
parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

# Build the model (cluster the data)
clusters = KMeans.train(parsedData, 10, maxIterations=1000,
        runs=100, initializationMode="random")

# Evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))

# Save and load model


Within Set Sum of Squared Error = 53627.2265609

In [66]:
c1 = clusters.clusterCenters[0]
count = 0 

for c1 in clusters.clusterCenters :
    print "cluster: " + str(count)
    count = count + 1
    synonyms = model.findSynonyms(c1, 10)
    for word, cosine_distance in synonyms:
        print("{}: {}".format(word.encode('utf-8'), cosine_distance))


cluster: 0
雪肌精: 0.787666022778
超效: 0.771770954132
柳晶凍: 0.767135679722
攜帶型: 0.766468405724
240ml: 0.766070902348
十勝特: 0.765772402287
濃膠: 0.762619614601
Kanebo: 0.76225143671
潔顏粉: 0.761189639568
15ML: 0.75984197855
cluster: 1
晶鑽桂馥: 0.846288323402
防曬隔離霜: 0.841037452221
超效: 0.840361237526
SPF10: 0.836658537388
隔離霜: 0.832134544849
OREAL: 0.831935405731
SPF20: 0.829072237015
Sofina: 0.828664958477
密粉: 0.826496899128
ORBIS: 0.826453268528
cluster: 2
日本藥妝: 0.875204265118
日本: 0.71024286747
大阪必買: 0.705878674984
藥妝店: 0.697661161423
電器行: 0.687138974667
motherways: 0.683901846409
好買: 0.683260202408
餐廚: 0.681079089642
必買: 0.66852504015
藥妝: 0.666158497334
cluster: 3
負離子吹風機: 0.859180152416
雪肌精: 0.855478584766
禮就: 0.821446239948
貴桑桑: 0.821358859539
快煮壺: 0.820593595505
千片: 0.816514968872
噴噴: 0.814507484436
Anya: 0.812125682831
Revive: 0.811225473881
采雪泡: 0.811107933521
cluster: 4
面霜: 0.831860780716
極潤: 0.829899013042
PH5: 0.829073607922
乳是: 0.82768446207
保濕: 0.825880289078
雪晶靈: 0.820730447769
FeeLife: 0.820348620415
水乳: 0.818243563175
安絲: 0.817625403404
活氧特潤: 0.815166950226
cluster: 5
減重: 0.729329884052
多喝水: 0.674757361412
孕期: 0.66714411974
節食: 0.652758598328
母乳: 0.646436452866
肥胖: 0.634298563004
奶量: 0.630047619343
滋補: 0.623379230499
害喜: 0.623227536678
調養: 0.619971513748
cluster: 6
蜂毒: 0.778626024723
攜帶型: 0.765002191067
Kanebo: 0.757535994053
240ml: 0.757527530193
十勝特: 0.756873607635
柳晶凍: 0.755248248577
濃膠: 0.75186342001
這罐: 0.751422226429
超效: 0.751118600368
潔顏粉: 0.750367939472
cluster: 7
這罐: 0.766361474991
保養品: 0.758870959282
柳晶凍: 0.754413187504
雪肌精: 0.753473579884
面膜: 0.752787768841
十勝特: 0.750624477863
肌研: 0.747960925102
BOBBI: 0.747652113438
攜帶型: 0.742332994938
施巴: 0.740555465221
cluster: 8
SPF10: 0.866022646427
蘭吉兒: 0.837427735329
防曬隔離霜: 0.836463272572
蛋肌: 0.834587812424
又潤: 0.834479570389
SPF35: 0.833531320095
banila: 0.832057952881
13g: 0.830287396908
一式: 0.826098024845
ORBIS: 0.821631968021
cluster: 9
Kanebo: 0.758593261242
攜帶型: 0.75580227375
蜂毒: 0.748610794544
眼藥水: 0.746614634991
潔顏粉: 0.742809653282
240ml: 0.742449998856
品是: 0.739953637123
紫草: 0.738954007626
日本必買: 0.737601041794
聖品: 0.735155880451

In [79]:
import csv
with open('./data/cookies_tags.csv.1.cluster', 'w') as csvfile:
    writer = csv.writer(csvfile, delimiter=' ',
                            quotechar='|', quoting=csv.QUOTE_MINIMAL)
    for cookie in user_tags :
        vec = user_tags[cookie]
        label = clusters.predict(vec)
        l = list(vec)
        l.insert(0,label)
        l.insert(0,cookie)
        writer.writerow(l)

In [ ]: